Assignment

This assignment aims to solve the problems of Mini Challenge 2.

LI NAN https://www.linkedin.com/in/li-nan-63b9251a6/
07-12-2021

1.Data Preparation

1.1 Global Settings

The global settings of R code chunks in this post is set as follows.

1.2 R Packages Installation

The following code input is to prepare for R Packages Installation.

# Install (when missing) and load every R package used in this post.
# Fixes: 'tmap' was listed twice, and T was used instead of TRUE.
packages <- c('raster', 'sf', 'tmap', 'clock', 'DT', 'ggiraph', 'plotly',
              'tidyverse', 'dplyr', 'readr', 'hrbrthemes')
for (p in packages) {
  # require() returns FALSE rather than erroring when a package is absent,
  # so it is used here only as an installation test.
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

1.3 Data Import

The following code imports the raw data sets from Mini Challenge 2 (“car_assignments.csv”, “cc_data.csv”, “gps.csv”, “loyalty_data.csv”).

# Import the four raw Mini-Challenge 2 data sets from the local "data" folder.
credit_debit <- read_csv("data/cc_data.csv")       # credit/debit card transactions
loyalty_data <- read_csv("data/loyalty_data.csv")  # loyalty card transactions
car_assignment <- read_csv("data/car_assignments.csv")  # car assignment records
GPS <- read_csv("data/gps.csv")                    # vehicle GPS tracking points
glimpse(credit_debit)
Rows: 1,490
Columns: 4
$ timestamp  <chr> "1/6/2014 7:28", "1/6/2014 7:34", "1/6/2014 7:35"~
$ location   <chr> "Brew've Been Served", "Hallowed Grounds", "Brew'~
$ price      <dbl> 11.34, 52.22, 8.33, 16.72, 4.24, 4.17, 28.73, 9.6~
$ last4ccnum <dbl> 4795, 7108, 6816, 9617, 7384, 5368, 7253, 4948, 9~
glimpse(loyalty_data)
Rows: 1,392
Columns: 4
$ timestamp  <chr> "1/8/2014", "1/8/2014", "1/14/2014", "1/9/2014", ~
$ location   <chr> "Carlyle Chemical Inc.", "Carlyle Chemical Inc.",~
$ price      <dbl> 4983.52, 4901.88, 4898.39, 4792.50, 4788.22, 4742~
$ loyaltynum <chr> "L8477", "L5756", "L2769", "L3317", "L8477", "L57~
glimpse(GPS)
Rows: 685,169
Columns: 4
$ Timestamp <chr> "01/06/2014 06:28:01", "01/06/2014 06:28:01", "01/~
$ id        <dbl> 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35~
$ lat       <dbl> 36.07623, 36.07622, 36.07621, 36.07622, 36.07621, ~
$ long      <dbl> 24.87469, 24.87460, 24.87444, 24.87425, 24.87417, ~
head(loyalty_data)
# A tibble: 6 x 4
  timestamp location               price loyaltynum
  <chr>     <chr>                  <dbl> <chr>     
1 1/8/2014  Carlyle Chemical Inc.  4984. L8477     
2 1/8/2014  Carlyle Chemical Inc.  4902. L5756     
3 1/14/2014 Abila Airport          4898. L2769     
4 1/9/2014  Abila Airport          4792. L3317     
5 1/15/2014 Maximum Iron and Steel 4788. L8477     
6 1/16/2014 Nationwide Refinery    4743. L5756     
head(credit_debit)
# A tibble: 6 x 4
  timestamp     location            price last4ccnum
  <chr>         <chr>               <dbl>      <dbl>
1 1/6/2014 7:28 Brew've Been Served 11.3        4795
2 1/6/2014 7:34 Hallowed Grounds    52.2        7108
3 1/6/2014 7:35 Brew've Been Served  8.33       6816
4 1/6/2014 7:36 Hallowed Grounds    16.7        9617
5 1/6/2014 7:37 Brew've Been Served  4.24       7384
6 1/6/2014 7:38 Brew've Been Served  4.17       5368

2.Tasks and Questions for Mini-Challenge2

2.1 Q1 Introduction

Using just the credit and loyalty card data, identify the most popular locations, and when they are popular. What anomalies do you see? What corrections would you recommend to correct these anomalies?

2.1.1 Data Preparation for Q1

Comparison of total amount between credit/debit card and loyalty card

After glimpsing the data structure of the credit and loyalty card data, a heat map is a good way to visualize the most popular locations and when they are popular. To create this graph, aggregation of the loyalty card data is needed.

# Add a constant helper column so that summing it per group yields the
# number of transactions (Frequency) for each timestamp/location pair.
# (Use <- for assignment rather than =.)
loyalty_data$count_event <- 1
credit_debit$count_event <- 1
head(loyalty_data)
# A tibble: 6 x 5
  timestamp location               price loyaltynum count_event
  <chr>     <chr>                  <dbl> <chr>            <dbl>
1 1/8/2014  Carlyle Chemical Inc.  4984. L8477                1
2 1/8/2014  Carlyle Chemical Inc.  4902. L5756                1
3 1/14/2014 Abila Airport          4898. L2769                1
4 1/9/2014  Abila Airport          4792. L3317                1
5 1/15/2014 Maximum Iron and Steel 4788. L8477                1
6 1/16/2014 Nationwide Refinery    4743. L5756                1
# Aggregate the loyalty card records: one row per day (timestamp) and
# location, with the transaction count and total money spent.
# Note: timestamp is still character here; it is converted to Date later.
# summarize() drops only the last grouping level, so the result remains
# grouped by timestamp.
aggregate_dataset <- loyalty_data %>% 
    group_by(timestamp,location) %>% 
    dplyr::summarize(Frequency = sum(count_event),Money_loyalty=sum(price))
head(aggregate_dataset)
# A tibble: 6 x 4
# Groups:   timestamp [1]
  timestamp location               Frequency Money_loyalty
  <chr>     <chr>                      <dbl>         <dbl>
1 1/10/2014 Abila Zacharo                  7         171. 
2 1/10/2014 Albert's Fine Clothing         1         126. 
3 1/10/2014 Bean There Done That           5          60.7
4 1/10/2014 Brew've Been Served           14         132. 
5 1/10/2014 Brewed Awakenings              3          33.9
6 1/10/2014 Carlyle Chemical Inc.          2        3717. 
# Convert the credit/debit timestamps (e.g. "1/6/2014 7:28") to Date; the
# trailing time-of-day is ignored by the "%m/%d/%Y" format, so all
# transactions on the same day collapse onto one date.
credit_debit$timestamp <- as.Date(credit_debit$timestamp, "%m/%d/%Y")
# Aggregate the card records per day and location, mirroring the loyalty
# card aggregation performed above.
aggregate_cc <- credit_debit %>% 
    group_by(timestamp,location) %>% 
    dplyr::summarize(Frequency = sum(count_event),Money_cd=sum(price))
head(aggregate_cc)
# A tibble: 6 x 4
# Groups:   timestamp [1]
  timestamp  location               Frequency Money_cd
  <date>     <chr>                      <dbl>    <dbl>
1 2014-01-06 Abila Airport                  4   7803. 
2 2014-01-06 Abila Zacharo                  6    380. 
3 2014-01-06 Albert's Fine Clothing         2    399. 
4 2014-01-06 Bean There Done That           5     44.4
5 2014-01-06 Brew've Been Served           16    185. 
6 2014-01-06 Brewed Awakenings              3     27.0
Adjust the date type and create a new column named “Day”.
# Convert the loyalty timestamps from character to Date as well.
aggregate_dataset$timestamp <- as.Date(aggregate_dataset$timestamp, "%m/%d/%Y")

# Extract the zero-padded day-of-month (e.g. "06") as a character column
# for use as the x-axis of the heat maps.
aggregate_dataset$Day <- format(aggregate_dataset$timestamp, format="%d")
aggregate_cc$Day <- format(aggregate_cc$timestamp, format="%d")
head(aggregate_dataset)
# A tibble: 6 x 5
# Groups:   timestamp [1]
  timestamp  location               Frequency Money_loyalty Day  
  <date>     <chr>                      <dbl>         <dbl> <chr>
1 2014-01-10 Abila Zacharo                  7         171.  10   
2 2014-01-10 Albert's Fine Clothing         1         126.  10   
3 2014-01-10 Bean There Done That           5          60.7 10   
4 2014-01-10 Brew've Been Served           14         132.  10   
5 2014-01-10 Brewed Awakenings              3          33.9 10   
6 2014-01-10 Carlyle Chemical Inc.          2        3717.  10   
head(aggregate_cc)
# A tibble: 6 x 5
# Groups:   timestamp [1]
  timestamp  location               Frequency Money_cd Day  
  <date>     <chr>                      <dbl>    <dbl> <chr>
1 2014-01-06 Abila Airport                  4   7803.  06   
2 2014-01-06 Abila Zacharo                  6    380.  06   
3 2014-01-06 Albert's Fine Clothing         2    399.  06   
4 2014-01-06 Bean There Done That           5     44.4 06   
5 2014-01-06 Brew've Been Served           16    185.  06   
6 2014-01-06 Brewed Awakenings              3     27.0 06   
Create a new tooltip text column for each data set.
# Build the tooltip strings shown by ggplotly on hover. sprintf() with %s
# coerces each value via as.character(), producing the same strings as the
# equivalent paste0() concatenation.
aggregate_dataset <- aggregate_dataset %>%
  mutate(
    text = sprintf("Location: %s\nDay of January: %s\nFrequency: %s",
                   location, Day, Frequency)
  )

aggregate_cc <- aggregate_cc %>%
  mutate(
    text2 = sprintf("Location: %s\nDay of January: %s\nFrequency: %s",
                    location, Day, Frequency)
  )

2.1.2 Data Visualization

Heat map for loyalty card usage frequency per day
# Heat map of loyalty card usage: day of January on the x-axis, location on
# the y-axis, tile darkness encoding transaction frequency.
# "light BLUE" is not a name in grDevices::colors(); the canonical name is
# "lightblue".
p <- ggplot(data = aggregate_dataset,
            aes(x = Day, y = location, fill = Frequency, text = text)) +
  geom_tile() +
  scale_fill_gradient(low = "lightblue", high = "black") +
  theme_ipsum()

# Shrink the y-axis labels so all location names fit on the plot.
p <- p + theme(axis.text.y = element_text(size = 8))

# Render interactively, showing only the custom tooltip on hover.
ggplotly(p, tooltip = "text")
Heat map for credit_debit card usage frequency per day
# Heat map of credit/debit card usage, matching the loyalty card heat map.
# "light yellow" is not a name in grDevices::colors(); the canonical name is
# "lightyellow".
z <- ggplot(data = aggregate_cc,
            aes(x = Day, y = location, fill = Frequency, text = text2)) +
  geom_tile() +
  scale_fill_gradient(low = "lightyellow", high = "red") +
  theme_ipsum()

# Shrink the y-axis labels so all location names fit on the plot.
z <- z + theme(axis.text.y = element_text(size = 8))

# Render interactively, showing only the custom tooltip on hover.
ggplotly(z, tooltip = "text2")

2.1.3 Infer and Analysis

Based on the two heat maps, we can infer that the most popular places from Jan. 06 to Jan. 19 are Brew’ve Been Served and Katerina’s Cafe, since the heat map color is darkest at these two places. However, this is still not very obvious, so we need to look more closely.

And from the tooltips, we can also see some difference between the frequencies of these two types of card usage, which are abnormal.

So the next step is to build up new data frame to see the difference of cost record and frequency difference between these two types of cards more obviously.

# Per day/location summaries for each card type: the largest aggregated
# amount and the total transaction frequency.
loyalty_money <- aggregate_dataset %>% group_by(Day,location) %>% dplyr::summarise(max_loyal=max(Money_loyalty),freq_loyal=sum(Frequency))

cc_money <- aggregate_cc %>% group_by(Day,location) %>% dplyr::summarise(max_cc = max(Money_cd),freq_cc=sum(Frequency))
# A full join keeps day/location pairs that appear in only one data set;
# those one-sided gaps are themselves anomalies worth inspecting.
Comparison <- full_join(cc_money, loyalty_money, by = c('Day','location'))
# After the full join, a missing match means zero recorded transactions.
Comparison[is.na(Comparison)] <- 0

datatable(Comparison,rownames = FALSE)
# For each day, keep the location(s) with the highest credit/debit card
# frequency, sorted by day in descending order.
Result1 <- Comparison  %>%
dplyr::group_by(Day) %>%
filter(freq_cc == max(freq_cc)) %>%
arrange(desc(Day))

datatable(Result1,rownames = FALSE)

From the new data frame “Result1”, we can now see that Katerina’s Cafe is the most popular place based on the credit and debit card records.

And we can also detect several anomalies based on these summary records.

1. The usage frequency of the credit/debit card does not equal that of the loyalty card.

2. The money spent via the credit/debit card does not equal that spent via the loyalty card.

These abnormal records are examined further with the vehicle data in the next section.

Another thought for Q1 visualization: compared with a heat map, do we have better ways to visualize this? For example, a line chart over the time period showing the money differences between the two card types.

2.2 Q2 Introduction

Add the vehicle data to your analysis of the credit and loyalty card data. How does your assessment of the anomalies in question 1 change based on this new data? What discrepancies between vehicle, credit, and loyalty card data do you find? Please limit your answer to 8 images and 500 words.

2.2.1 Data Preparation for Q2

First, “MC2-tourist.tif” is imported for data preparation.

# Import the Abila tourist map (GeoTIFF) to use as the plot background.
bgmap <- raster("Data/MC2-tourist.tif")
bgmap
class      : RasterLayer 
band       : 1  (of  3  bands)
dimensions : 1595, 2706, 4316070  (nrow, ncol, ncell)
resolution : 3.16216e-05, 3.16216e-05  (x, y)
extent     : 24.82419, 24.90976, 36.04499, 36.09543  (xmin, xmax, ymin, ymax)
crs        : +proj=longlat +datum=WGS84 +no_defs 
source     : MC2-tourist.tif 
names      : MC2.tourist 
values     : 0, 255  (min, max)
# Plot the background map as a true-colour RGB raster (bands 1-3).
tm_shape(bgmap) +
tm_rgb(bgmap, r = 1,g = 2,b = 3,
       alpha = NA,
       saturation = 1,
       interpolate = TRUE,
       max.value = 255)

# Read the Abila road network layer (ESRI Shapefile).
Abila_st <- st_read(dsn = "Data/Geospatial",
                    layer = "Abila")
Reading layer `Abila' from data source 
  `C:\linanyaogaibian\Dataviz_blog\_posts\2021-07-13-assignment\Data\Geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 3290 features and 9 fields
Geometry type: LINESTRING
Dimension:     XY
Bounding box:  xmin: 24.82401 ymin: 36.04502 xmax: 24.90997 ymax: 36.09492
Geodetic CRS:  WGS 84
glimpse(GPS)
Rows: 685,169
Columns: 4
$ Timestamp <chr> "01/06/2014 06:28:01", "01/06/2014 06:28:01", "01/~
$ id        <dbl> 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35~
$ lat       <dbl> 36.07623, 36.07622, 36.07621, 36.07622, 36.07621, ~
$ long      <dbl> 24.87469, 24.87460, 24.87444, 24.87425, 24.87417, ~
# Parse the GPS timestamps. strptime() returns POSIXlt, a list-based class
# that is awkward to keep inside data frame columns; wrap it in
# as.POSIXct() so the column is stored as a compact POSIXct vector.
GPS$Timestamp <- as.POSIXct(strptime(GPS$Timestamp, "%m/%d/%Y %H:%M:%S"))
# Day-of-month as a factor -- each car/day pair becomes one path later.
GPS$day <- as.factor(get_day(GPS$Timestamp))
# Car ids are categorical labels, not quantities.
GPS$id <- as_factor(GPS$id)
glimpse(GPS)
Rows: 685,169
Columns: 5
$ Timestamp <dttm> 2014-01-06 06:28:01, 2014-01-06 06:28:01, 2014-01~
$ id        <fct> 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35~
$ lat       <dbl> 36.07623, 36.07622, 36.07621, 36.07622, 36.07621, ~
$ long      <dbl> 24.87469, 24.87460, 24.87444, 24.87425, 24.87417, ~
$ day       <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,~

Note: from this step onward, the approach may differ from the instructor's.

# Convert the GPS points to an sf object, taking the long/lat columns as
# coordinates in WGS84 (EPSG:4326).
GPS_sf <- st_as_sf(GPS, 
                   coords = c("long", "lat"),
                       crs= 4326)
GPS_sf
Simple feature collection with 685169 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 24.82509 ymin: 36.04802 xmax: 24.90849 ymax: 36.08996
Geodetic CRS:  WGS 84
# A tibble: 685,169 x 4
   Timestamp           id    day              geometry
 * <dttm>              <fct> <fct>         <POINT [°]>
 1 2014-01-06 06:28:01 35    6     (24.87469 36.07623)
 2 2014-01-06 06:28:01 35    6      (24.8746 36.07622)
 3 2014-01-06 06:28:03 35    6     (24.87444 36.07621)
 4 2014-01-06 06:28:05 35    6     (24.87425 36.07622)
 5 2014-01-06 06:28:06 35    6     (24.87417 36.07621)
 6 2014-01-06 06:28:07 35    6     (24.87406 36.07619)
 7 2014-01-06 06:28:09 35    6     (24.87391 36.07619)
 8 2014-01-06 06:28:10 35    6     (24.87381 36.07618)
 9 2014-01-06 06:28:11 35    6     (24.87374 36.07617)
10 2014-01-06 06:28:12 35    6     (24.87362 36.07618)
# ... with 685,159 more rows
# Build one LINESTRING per car per day: group by id and day and cast the
# points to a line. do_union = FALSE preserves the point order so the path
# follows the GPS trace; the mean timestamp is kept as a per-path attribute.
gps_path <- GPS_sf %>%
  group_by(id, day) %>%
  summarize(m = mean(Timestamp), 
            do_union=FALSE) %>%
  st_cast("LINESTRING")
gps_path
Simple feature collection with 508 features and 3 fields
Geometry type: LINESTRING
Dimension:     XY
Bounding box:  xmin: 24.82509 ymin: 36.04802 xmax: 24.90849 ymax: 36.08996
Geodetic CRS:  WGS 84
# A tibble: 508 x 4
# Groups:   id [40]
   id    day   m                                              geometry
   <fct> <fct> <dttm>                                 <LINESTRING [°]>
 1 1     6     2014-01-06 15:02:08 (24.88258 36.06646, 24.88259 36.06~
 2 1     7     2014-01-07 12:41:07 (24.87957 36.04803, 24.87957 36.04~
 3 1     8     2014-01-08 14:35:25 (24.88265 36.06643, 24.88266 36.06~
 4 1     9     2014-01-09 12:04:45 (24.88261 36.06646, 24.88257 36.06~
 5 1     10    2014-01-10 16:04:58 (24.88265 36.0665, 24.88261 36.066~
 6 1     11    2014-01-11 16:18:32 (24.88258 36.06651, 24.88246 36.06~
 7 1     12    2014-01-12 13:31:05 (24.88259 36.06643, 24.8824 36.066~
 8 1     13    2014-01-13 13:46:15 (24.88265 36.06642, 24.8826 36.066~
 9 1     14    2014-01-14 14:04:23 (24.88261 36.06644, 24.88262 36.06~
10 1     15    2014-01-15 15:33:54 (24.88263 36.06647, 24.88257 36.06~
# ... with 498 more rows
# Keep only the daily paths of a single car (id 5).
# Note: id is a factor; comparing with the number 5 matches the "5" level.
gps_path_selected <- gps_path %>%
  filter(id==5)
# Switch tmap to interactive (view) mode.
tmap_mode("view")
# Overlay the selected car's daily paths on the background map.
tm_shape(bgmap) +
  tm_rgb(bgmap, r = 1,g = 2,b = 3,
       alpha = NA,
       saturation = 1,
       interpolate = TRUE,
       max.value = 255) +
  tm_shape(gps_path_selected) +
  tm_lines()